kreuzberg 1.7.0__py3-none-any.whl → 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kreuzberg/__init__.py +6 -2
- kreuzberg/_constants.py +6 -0
- kreuzberg/_html.py +32 -0
- kreuzberg/_mime_types.py +109 -1
- kreuzberg/_pandoc.py +122 -169
- kreuzberg/_pdf.py +189 -0
- kreuzberg/_pptx.py +88 -0
- kreuzberg/_string.py +5 -8
- kreuzberg/_sync.py +6 -1
- kreuzberg/_tesseract.py +98 -71
- kreuzberg/_tmp.py +37 -0
- kreuzberg/_types.py +71 -0
- kreuzberg/_xlsx.py +92 -0
- kreuzberg/extraction.py +269 -64
- kreuzberg-2.0.0.dist-info/METADATA +419 -0
- kreuzberg-2.0.0.dist-info/RECORD +21 -0
- kreuzberg/_extractors.py +0 -280
- kreuzberg-1.7.0.dist-info/METADATA +0 -342
- kreuzberg-1.7.0.dist-info/RECORD +0 -15
- {kreuzberg-1.7.0.dist-info → kreuzberg-2.0.0.dist-info}/LICENSE +0 -0
- {kreuzberg-1.7.0.dist-info → kreuzberg-2.0.0.dist-info}/WHEEL +0 -0
- {kreuzberg-1.7.0.dist-info → kreuzberg-2.0.0.dist-info}/top_level.txt +0 -0
kreuzberg/__init__.py
CHANGED
@@ -1,9 +1,13 @@
|
|
1
|
-
from .
|
2
|
-
from .
|
1
|
+
from ._types import ExtractionResult, Metadata
|
2
|
+
from .exceptions import KreuzbergError, MissingDependencyError, OCRError, ParsingError, ValidationError
|
3
|
+
from .extraction import extract_bytes, extract_file
|
3
4
|
|
4
5
|
__all__ = [
|
5
6
|
"ExtractionResult",
|
6
7
|
"KreuzbergError",
|
8
|
+
"Metadata",
|
9
|
+
"MissingDependencyError",
|
10
|
+
"OCRError",
|
7
11
|
"ParsingError",
|
8
12
|
"ValidationError",
|
9
13
|
"extract_bytes",
|
kreuzberg/_constants.py
ADDED
kreuzberg/_html.py
ADDED
@@ -0,0 +1,32 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
from typing import TYPE_CHECKING
|
4
|
+
|
5
|
+
import html_to_markdown
|
6
|
+
from anyio import Path as AsyncPath
|
7
|
+
|
8
|
+
from kreuzberg import ExtractionResult
|
9
|
+
from kreuzberg._mime_types import MARKDOWN_MIME_TYPE
|
10
|
+
from kreuzberg._string import normalize_spaces, safe_decode
|
11
|
+
from kreuzberg._sync import run_sync
|
12
|
+
|
13
|
+
if TYPE_CHECKING:
|
14
|
+
from pathlib import Path
|
15
|
+
|
16
|
+
|
17
|
+
async def extract_html_string(file_path_or_contents: Path | bytes) -> ExtractionResult:
    """Convert HTML, supplied as raw bytes or as a file path, to markdown.

    Args:
        file_path_or_contents: Either the HTML document as bytes, or the path
            of an HTML file to read.

    Returns:
        An ExtractionResult holding the space-normalized markdown, the
        markdown MIME type and an empty metadata mapping.
    """
    if isinstance(file_path_or_contents, bytes):
        # In-memory input: decode the bytes to text.
        html = safe_decode(file_path_or_contents)
    else:
        # Path input: read the file asynchronously (default text encoding).
        html = await AsyncPath(file_path_or_contents).read_text()

    # The conversion is a synchronous call, so it is dispatched via run_sync.
    markdown = await run_sync(html_to_markdown.convert_to_markdown, html)
    return ExtractionResult(content=normalize_spaces(markdown), mime_type=MARKDOWN_MIME_TYPE, metadata={})
|
kreuzberg/_mime_types.py
CHANGED
@@ -1,16 +1,30 @@
|
|
1
1
|
from __future__ import annotations
|
2
2
|
|
3
|
+
from mimetypes import guess_type
|
4
|
+
from pathlib import Path
|
3
5
|
from typing import TYPE_CHECKING, Final
|
4
6
|
|
7
|
+
from kreuzberg.exceptions import ValidationError
|
8
|
+
|
5
9
|
if TYPE_CHECKING: # pragma: no cover
|
6
10
|
from collections.abc import Mapping
|
11
|
+
from os import PathLike
|
7
12
|
|
8
13
|
HTML_MIME_TYPE: Final = "text/html"
|
9
14
|
MARKDOWN_MIME_TYPE: Final = "text/markdown"
|
10
15
|
PDF_MIME_TYPE: Final = "application/pdf"
|
11
16
|
PLAIN_TEXT_MIME_TYPE: Final = "text/plain"
|
12
17
|
POWER_POINT_MIME_TYPE: Final = "application/vnd.openxmlformats-officedocument.presentationml.presentation"
|
18
|
+
# Excel formats
|
13
19
|
EXCEL_MIME_TYPE: Final = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
|
20
|
+
EXCEL_BINARY_MIME_TYPE: Final = "application/vnd.ms-excel"
|
21
|
+
EXCEL_MACRO_MIME_TYPE: Final = "application/vnd.ms-excel.sheet.macroEnabled.12"
|
22
|
+
EXCEL_BINARY_2007_MIME_TYPE: Final = "application/vnd.ms-excel.sheet.binary.macroEnabled.12"
|
23
|
+
EXCEL_ADDON_MIME_TYPE: Final = "application/vnd.ms-excel.addin.macroEnabled.12"
|
24
|
+
EXCEL_TEMPLATE_MIME_TYPE: Final = "application/vnd.ms-excel.template.macroEnabled.12"
|
25
|
+
|
26
|
+
# OpenDocument spreadsheet format
|
27
|
+
OPENDOC_SPREADSHEET_MIME_TYPE: Final = "application/vnd.oasis.opendocument.spreadsheet" # ods
|
14
28
|
PLAIN_TEXT_MIME_TYPES: Final[set[str]] = {PLAIN_TEXT_MIME_TYPE, MARKDOWN_MIME_TYPE}
|
15
29
|
|
16
30
|
IMAGE_MIME_TYPES: Final[set[str]] = {
|
@@ -85,9 +99,103 @@ PANDOC_SUPPORTED_MIME_TYPES: Final[set[str]] = {
|
|
85
99
|
"text/x-rst",
|
86
100
|
}
|
87
101
|
|
102
|
+
# Every MIME type that is treated as a spreadsheet.
SPREADSHEET_MIME_TYPES: Final[set[str]] = {
    EXCEL_MIME_TYPE,
    EXCEL_BINARY_MIME_TYPE,
    EXCEL_MACRO_MIME_TYPE,
    EXCEL_BINARY_2007_MIME_TYPE,
    EXCEL_ADDON_MIME_TYPE,
    EXCEL_TEMPLATE_MIME_TYPE,
    OPENDOC_SPREADSHEET_MIME_TYPE,
}

# Lower-case file extension (including the leading dot) -> MIME type.
EXT_TO_MIME_TYPE: Final[Mapping[str, str]] = {
    # Plain text, PDF and HTML
    ".txt": PLAIN_TEXT_MIME_TYPE,
    ".md": MARKDOWN_MIME_TYPE,
    ".pdf": PDF_MIME_TYPE,
    ".html": HTML_MIME_TYPE,
    ".htm": HTML_MIME_TYPE,
    # Spreadsheets
    ".xlsx": EXCEL_MIME_TYPE,
    ".xls": EXCEL_BINARY_MIME_TYPE,
    ".xlsm": EXCEL_MACRO_MIME_TYPE,
    ".xlsb": EXCEL_BINARY_2007_MIME_TYPE,
    ".xlam": EXCEL_ADDON_MIME_TYPE,
    # NOTE(review): ".xla" is the legacy Excel add-in extension; mapping it to
    # the macro-enabled *template* type looks unintended -- confirm.
    ".xla": EXCEL_TEMPLATE_MIME_TYPE,
    ".ods": OPENDOC_SPREADSHEET_MIME_TYPE,
    # Presentations
    ".pptx": POWER_POINT_MIME_TYPE,
    # Images
    ".bmp": "image/bmp",
    ".gif": "image/gif",
    ".jpg": "image/jpeg",
    ".jpeg": "image/jpeg",
    ".png": "image/png",
    ".tiff": "image/tiff",
    ".tif": "image/tiff",
    ".webp": "image/webp",
    ".jp2": "image/jp2",
    ".jpx": "image/jpx",
    ".jpm": "image/jpm",
    ".mj2": "image/mj2",
    ".pnm": "image/x-portable-anymap",
    ".pbm": "image/x-portable-bitmap",
    ".pgm": "image/x-portable-graymap",
    ".ppm": "image/x-portable-pixmap",
    # Other text and document formats
    ".csv": "text/csv",
    ".tsv": "text/tab-separated-values",
    ".rst": "text/x-rst",
    ".org": "text/x-org",
    ".epub": "application/epub+zip",
    ".rtf": "application/rtf",
    ".odt": "application/vnd.oasis.opendocument.text",
    ".docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    ".bib": "application/x-bibtex",
    ".ipynb": "application/x-ipynb+json",
    ".tex": "application/x-latex",
}
|
154
|
+
|
88
155
|
SUPPORTED_MIME_TYPES: Final[set[str]] = (
|
89
156
|
PLAIN_TEXT_MIME_TYPES
|
90
157
|
| IMAGE_MIME_TYPES
|
91
158
|
| PANDOC_SUPPORTED_MIME_TYPES
|
92
|
-
|
|
159
|
+
| SPREADSHEET_MIME_TYPES
|
160
|
+
| {PDF_MIME_TYPE, POWER_POINT_MIME_TYPE, HTML_MIME_TYPE}
|
93
161
|
)
|
162
|
+
|
163
|
+
|
164
|
+
def validate_mime_type(file_path: PathLike[str] | str, mime_type: str | None = None) -> str:
    """Validate and detect the MIME type for a given file.

    Only the name and suffix of ``file_path`` are inspected; the file itself
    is not opened.

    Args:
        file_path: The path to the file.
        mime_type: Optional explicit MIME type. If provided, this will be validated.
            If not provided, the function will attempt to detect the MIME type
            from the file extension.

    Raises:
        ValidationError: If the MIME type is not supported or cannot be determined.

    Returns:
        The validated MIME type. When the given or detected type is not an exact
        member of SUPPORTED_MIME_TYPES but starts with one (for example because
        it carries parameters), the longest such supported type is returned.
    """
    path = Path(file_path)

    if not mime_type:
        # Prefer the package's own extension table; fall back to the stdlib guess.
        ext = path.suffix.lower()
        mime_type = EXT_TO_MIME_TYPE.get(ext) or guess_type(path.name)[0]

        if not mime_type:  # pragma: no cover
            raise ValidationError(
                "Could not determine the mime type of the file. Please specify the mime_type parameter explicitly.",
                context={"input_file": str(path), "extension": ext},
            )

    if mime_type in SUPPORTED_MIME_TYPES:
        return mime_type

    # Several supported types can be prefixes of the same string (e.g.
    # "application/vnd.ms-excel" is a prefix of the macro-enabled Excel types).
    # Iterating the set and returning the first hit would make the result depend
    # on set ordering, so pick the longest -- i.e. most specific -- match.
    prefix_matches = [supported for supported in SUPPORTED_MIME_TYPES if mime_type.startswith(supported)]
    if prefix_matches:
        return max(prefix_matches, key=len)

    raise ValidationError(
        f"Unsupported mime type: {mime_type}",
        context={"mime_type": mime_type, "supported_mimetypes": ",".join(sorted(SUPPORTED_MIME_TYPES))},
    )
|
kreuzberg/_pandoc.py
CHANGED
@@ -1,26 +1,29 @@
|
|
1
1
|
from __future__ import annotations
|
2
2
|
|
3
3
|
import subprocess
|
4
|
-
|
5
|
-
from
|
4
|
+
import sys
|
5
|
+
from functools import partial
|
6
6
|
from json import JSONDecodeError, loads
|
7
|
-
from
|
8
|
-
from typing import TYPE_CHECKING, Any, Final, Literal, TypedDict, cast
|
7
|
+
from typing import TYPE_CHECKING, Any, Final, Literal, cast
|
9
8
|
|
9
|
+
from anyio import CapacityLimiter, create_task_group, to_process
|
10
10
|
from anyio import Path as AsyncPath
|
11
11
|
|
12
|
+
from kreuzberg._constants import DEFAULT_MAX_PROCESSES
|
13
|
+
from kreuzberg._mime_types import MARKDOWN_MIME_TYPE
|
12
14
|
from kreuzberg._string import normalize_spaces
|
13
15
|
from kreuzberg._sync import run_sync
|
16
|
+
from kreuzberg._tmp import create_temp_file
|
17
|
+
from kreuzberg._types import ExtractionResult, Metadata
|
14
18
|
from kreuzberg.exceptions import MissingDependencyError, ParsingError, ValidationError
|
15
19
|
|
16
20
|
if TYPE_CHECKING: # pragma: no cover
|
17
21
|
from collections.abc import Mapping
|
18
22
|
from os import PathLike
|
19
23
|
|
20
|
-
|
21
|
-
from
|
22
|
-
|
23
|
-
from typing_extensions import NotRequired
|
24
|
+
if sys.version_info < (3, 11): # pragma: no cover
|
25
|
+
from exceptiongroup import ExceptionGroup # type: ignore[import-not-found]
|
26
|
+
|
24
27
|
|
25
28
|
version_ref: Final[dict[str, bool]] = {"checked": False}
|
26
29
|
|
@@ -145,65 +148,6 @@ MIMETYPE_TO_FILE_EXTENSION_MAPPING: Final[Mapping[str, str]] = {
|
|
145
148
|
}
|
146
149
|
|
147
150
|
|
148
|
-
class Metadata(TypedDict, total=False):
|
149
|
-
"""Document metadata extracted from Pandoc document.
|
150
|
-
|
151
|
-
All fields are optional but will only be included if they contain non-empty values.
|
152
|
-
Any field that would be empty or None is omitted from the dictionary.
|
153
|
-
"""
|
154
|
-
|
155
|
-
title: NotRequired[str]
|
156
|
-
"""Document title."""
|
157
|
-
subtitle: NotRequired[str]
|
158
|
-
"""Document subtitle."""
|
159
|
-
abstract: NotRequired[str | list[str]]
|
160
|
-
"""Document abstract, summary or description."""
|
161
|
-
authors: NotRequired[list[str]]
|
162
|
-
"""List of document authors."""
|
163
|
-
date: NotRequired[str]
|
164
|
-
"""Document date as string to preserve original format."""
|
165
|
-
subject: NotRequired[str]
|
166
|
-
"""Document subject or topic."""
|
167
|
-
description: NotRequired[str]
|
168
|
-
"""Extended description."""
|
169
|
-
keywords: NotRequired[list[str]]
|
170
|
-
"""Keywords or tags."""
|
171
|
-
categories: NotRequired[list[str]]
|
172
|
-
"""Categories or classifications."""
|
173
|
-
version: NotRequired[str]
|
174
|
-
"""Version identifier."""
|
175
|
-
language: NotRequired[str]
|
176
|
-
"""Document language code."""
|
177
|
-
references: NotRequired[list[str]]
|
178
|
-
"""Reference entries."""
|
179
|
-
citations: NotRequired[list[str]]
|
180
|
-
"""Citation identifiers."""
|
181
|
-
copyright: NotRequired[str]
|
182
|
-
"""Copyright information."""
|
183
|
-
license: NotRequired[str]
|
184
|
-
"""License information."""
|
185
|
-
identifier: NotRequired[str]
|
186
|
-
"""Document identifier."""
|
187
|
-
publisher: NotRequired[str]
|
188
|
-
"""Publisher name."""
|
189
|
-
contributors: NotRequired[list[str]]
|
190
|
-
"""Additional contributors."""
|
191
|
-
creator: NotRequired[str]
|
192
|
-
"""Document creator."""
|
193
|
-
institute: NotRequired[str | list[str]]
|
194
|
-
"""Institute or organization."""
|
195
|
-
|
196
|
-
|
197
|
-
@dataclass
|
198
|
-
class PandocResult:
|
199
|
-
"""Result of a pandoc conversion including content and metadata."""
|
200
|
-
|
201
|
-
content: str
|
202
|
-
"""The processed markdown content."""
|
203
|
-
metadata: Metadata
|
204
|
-
"""Document metadata extracted from the source."""
|
205
|
-
|
206
|
-
|
207
151
|
def _extract_inline_text(node: dict[str, Any]) -> str | None:
|
208
152
|
if node_type := node.get(TYPE_FIELD):
|
209
153
|
if node_type == INLINE_STR:
|
@@ -246,13 +190,14 @@ def _extract_meta_value(node: Any) -> str | list[str] | None:
|
|
246
190
|
if node_type == META_LIST:
|
247
191
|
results = []
|
248
192
|
for value in [value for item in content if (value := _extract_meta_value(item))]:
|
249
|
-
if isinstance(value, list):
|
250
|
-
results.extend(value)
|
193
|
+
if isinstance(value, list): # pragma: no cover
|
194
|
+
results.extend(value)
|
251
195
|
else:
|
252
196
|
results.append(value)
|
253
197
|
return results
|
254
198
|
|
255
|
-
|
199
|
+
# This branch is only taken for complex metadata blocks which we don't use
|
200
|
+
if blocks := [block for block in content if block.get(TYPE_FIELD) == BLOCK_PARA]: # pragma: no cover
|
256
201
|
block_texts = []
|
257
202
|
for block in blocks:
|
258
203
|
block_content = block.get(CONTENT_FIELD, [])
|
@@ -317,134 +262,142 @@ async def _validate_pandoc_version() -> None:
|
|
317
262
|
raise MissingDependencyError("Pandoc is not installed.") from e
|
318
263
|
|
319
264
|
|
320
|
-
async def _handle_extract_metadata(
|
265
|
+
async def _handle_extract_metadata(
    input_file: str | PathLike[str], *, mime_type: str, max_processes: int = DEFAULT_MAX_PROCESSES
) -> Metadata:
    """Run pandoc to dump the document as JSON and extract its metadata.

    Args:
        input_file: The path of the document to inspect.
        mime_type: The mime type of the document; mapped to a pandoc input format.
        max_processes: Capacity of the limiter created for the pandoc call.

    Raises:
        ParsingError: If pandoc exits with a non-zero status, or if running it
            or reading/decoding its JSON output fails.

    Returns:
        The metadata extracted from pandoc's JSON output.
    """
    pandoc_type = _get_pandoc_type_from_mime_type(mime_type)
    # create_temp_file hands back the temp path plus an async cleanup callable.
    metadata_file, unlink = await create_temp_file(".json")
    try:
        command = [
            "pandoc",
            str(input_file),
            f"--from={pandoc_type}",
            "--to=json",
            "--standalone",
            "--quiet",
            "--output",
            # Stringified so every argv entry is a plain str, as in _handle_extract_file.
            str(metadata_file),
        ]

        # NOTE(review): a fresh CapacityLimiter is built per call, so the cap is
        # not shared between calls -- confirm this is intended.
        result = await to_process.run_sync(
            partial(subprocess.run, capture_output=True),
            command,
            cancellable=True,
            limiter=CapacityLimiter(max_processes),
        )

        if result.returncode != 0:
            # stderr is captured as bytes; decode it so the error context is text.
            raise ParsingError(
                "Failed to extract file data",
                context={"file": str(input_file), "error": result.stderr.decode("utf-8", errors="replace")},
            )

        json_data = loads(await AsyncPath(metadata_file).read_text("utf-8"))
        return _extract_metadata(json_data)
    except (RuntimeError, OSError, JSONDecodeError) as e:
        raise ParsingError("Failed to extract file data", context={"file": str(input_file)}) from e
    finally:
        # Always remove the temp file, on success and on failure.
        await unlink()
|
356
298
|
|
357
299
|
|
358
300
|
async def _handle_extract_file(
    input_file: str | PathLike[str], *, mime_type: str, max_processes: int = DEFAULT_MAX_PROCESSES
) -> str:
    """Run pandoc to convert the document to markdown and return the text.

    Args:
        input_file: The path of the document to convert.
        mime_type: The mime type of the document; mapped to a pandoc input format.
        max_processes: Capacity of the limiter created for the pandoc call.

    Raises:
        ParsingError: If pandoc exits with a non-zero status, or if running it
            or reading its output fails.

    Returns:
        The markdown produced by pandoc, passed through normalize_spaces.
    """
    pandoc_type = _get_pandoc_type_from_mime_type(mime_type)
    # create_temp_file hands back the temp path plus an async cleanup callable.
    output_path, unlink = await create_temp_file(".md")
    try:
        command = [
            "pandoc",
            str(input_file),
            f"--from={pandoc_type}",
            "--to=markdown",
            "--standalone",
            "--wrap=preserve",
            "--quiet",
            "--output",
            str(output_path),
        ]

        # NOTE(review): a fresh CapacityLimiter is built per call, so the cap is
        # not shared between calls -- confirm this is intended.
        result = await to_process.run_sync(
            partial(subprocess.run, capture_output=True),
            command,
            cancellable=True,
            limiter=CapacityLimiter(max_processes),
        )

        if result.returncode != 0:
            # stderr is captured as bytes; decode it so the error context is text.
            raise ParsingError(
                "Failed to extract file data",
                context={"file": str(input_file), "error": result.stderr.decode("utf-8", errors="replace")},
            )

        text = await AsyncPath(output_path).read_text("utf-8")
        return normalize_spaces(text)
    except (RuntimeError, OSError) as e:
        raise ParsingError("Failed to extract file data", context={"file": str(input_file)}) from e
    finally:
        # Always remove the temp file, on success and on failure.
        await unlink()
|
362
335
|
|
363
|
-
|
364
|
-
|
365
|
-
|
366
|
-
|
367
|
-
str(input_file),
|
368
|
-
f"--from={pandoc_type}",
|
369
|
-
"--to=markdown",
|
370
|
-
"--standalone",
|
371
|
-
"--wrap=preserve",
|
372
|
-
"--quiet",
|
373
|
-
"--output",
|
374
|
-
output_file.name,
|
375
|
-
]
|
376
|
-
|
377
|
-
if extra_args:
|
378
|
-
command.extend(extra_args)
|
379
|
-
|
380
|
-
result = await run_sync(
|
381
|
-
subprocess.run,
|
382
|
-
command,
|
383
|
-
capture_output=True,
|
384
|
-
)
|
385
|
-
|
386
|
-
if result.returncode != 0:
|
387
|
-
raise ParsingError(
|
388
|
-
"Failed to extract file data", context={"file": str(input_file), "error": result.stderr.decode()}
|
389
|
-
)
|
390
|
-
|
391
|
-
text = await AsyncPath(output_file.name).read_text("utf-8")
|
392
|
-
|
393
|
-
return normalize_spaces(text)
|
394
|
-
|
395
|
-
except (RuntimeError, OSError) as e:
|
396
|
-
raise ParsingError("Failed to extract file data", context={"file": str(input_file)}) from e
|
397
|
-
|
398
|
-
finally:
|
399
|
-
output_file.close()
|
400
|
-
await AsyncPath(output_file.name).unlink()
|
401
|
-
|
402
|
-
|
403
|
-
async def process_file(
|
404
|
-
input_file: str | PathLike[str], *, mime_type: str, extra_args: list[str] | None = None
|
405
|
-
) -> PandocResult:
|
336
|
+
|
337
|
+
async def process_file_with_pandoc(
    input_file: str | PathLike[str], *, mime_type: str, max_processes: int = DEFAULT_MAX_PROCESSES
) -> ExtractionResult:
    """Convert a single file to markdown with Pandoc and collect its metadata.

    Metadata extraction and content conversion are started as two tasks in one
    task group.

    Args:
        input_file: The path to the file to process.
        mime_type: The mime type of the file.
        max_processes: Forwarded to both extraction helpers. Defaults to
            DEFAULT_MAX_PROCESSES.

    Raises:
        ParsingError: If the file data could not be extracted.

    Returns:
        An ExtractionResult with the markdown content, the extracted metadata
        and the markdown MIME type.
    """
    await _validate_pandoc_version()

    # The result is discarded; presumably this call rejects mime types that
    # pandoc cannot handle before any work starts -- TODO confirm.
    _get_pandoc_type_from_mime_type(mime_type)

    extracted_metadata: Metadata = {}
    extracted_text: str = ""

    async def _collect_metadata() -> None:
        nonlocal extracted_metadata
        extracted_metadata = await _handle_extract_metadata(
            input_file, mime_type=mime_type, max_processes=max_processes
        )

    async def _collect_text() -> None:
        nonlocal extracted_text
        extracted_text = await _handle_extract_file(input_file, mime_type=mime_type, max_processes=max_processes)

    try:
        async with create_task_group() as task_group:
            task_group.start_soon(_collect_metadata)
            task_group.start_soon(_collect_text)
    except ExceptionGroup as group:
        # Surface a single ParsingError, chained to the first failure in the group.
        first_failure = group.exceptions[0]
        raise ParsingError("Failed to extract file data", context={"file": str(input_file)}) from first_failure

    return ExtractionResult(
        content=normalize_spaces(extracted_text),
        metadata=extracted_metadata,
        mime_type=MARKDOWN_MIME_TYPE,
    )
|
428
381
|
|
429
382
|
|
430
|
-
async def
|
383
|
+
async def process_content_with_pandoc(
    content: bytes, *, mime_type: str, max_processes: int = DEFAULT_MAX_PROCESSES
) -> ExtractionResult:
    """Process content using Pandoc and convert to markdown.

    The bytes are written to a temporary file, which is then handed to
    process_file_with_pandoc. The temporary file is removed afterwards, whether
    or not processing succeeded.

    Args:
        content: The content to process.
        mime_type: The mime type of the content. It selects the temp file's
            extension; "md" is used when the type has no mapped extension.
        max_processes: Forwarded to process_file_with_pandoc. Defaults to
            DEFAULT_MAX_PROCESSES.

    Returns:
        ExtractionResult
    """
    extension = MIMETYPE_TO_FILE_EXTENSION_MAPPING.get(mime_type) or "md"
    # create_temp_file hands back the temp path plus an async cleanup callable.
    input_file, unlink = await create_temp_file(f".{extension}")

    try:
        await AsyncPath(input_file).write_bytes(content)
        return await process_file_with_pandoc(input_file, mime_type=mime_type, max_processes=max_processes)
    finally:
        # Clean up even when writing or processing raises, so the file is not leaked.
        await unlink()
|